# ------------------- Local Protest in Israel ------------------- #
#   Module:           helper for GLM analysis
#   Specifically:     Packages
#   Author:           Karsten
# --------------------------------------------------------------- # 

# Summarise a vector by a single "typical" value:
# - numeric input  -> arithmetic mean (missing values are ignored)
# - factor input   -> label of the most frequent level
# - anything else  -> most frequent value, returned as a character label
# Ties are resolved in favour of the first maximum (which.max() behaviour).
getmode <- function(v) {
  if (is.numeric(v)) {
    # table() below drops NA by default, so ignore NA here as well
    return(mean(v, na.rm = TRUE))
  }
  counts <- table(v)
  if (is.factor(v)) {
    return(levels(v)[which.max(counts)])
  }
  # non-factors have no levels(); the table names carry the observed values
  names(counts)[which.max(counts)]
}

# Extract the row of `caret_fit$results` whose row name matches the row name
# of `caret_fit$bestTune`, and return it with its row names reset.
get_best_result <- function(caret_fit) {
  tuning_grid <- caret_fit$results
  is_best <- rownames(tuning_grid) == rownames(caret_fit$bestTune)
  winner <- tuning_grid[which(is_best), ]
  # drop the original row label so the result starts at row 1
  rownames(winner) <- NULL
  winner
}

# Compute the F1 score from a data frame of predictions.
# - data:  must provide the columns `pred` (predicted) and `obs` (observed)
# - lev:   class labels; the first entry is used as the positive class
# - model: accepted but not used
# Returns a named numeric vector with the single element `F1`.
f1 <- function(data, lev = NULL, model = NULL) {
  positive_class <- lev[1]
  score <- MLmetrics::F1_Score(
    y_true = data$obs,
    y_pred = data$pred,
    positive = positive_class
  )
  c(F1 = score)
}

# Auxiliary functions
# - generate resample labels "Resample1", "Resample2", ... (one per element
#   of x), zero-padded so that all labels have the same width
prettySeq <- function(x) {
  positions <- format(seq_along(x))
  # format() right-aligns with spaces; turn that padding into zeros
  zero_padded <- gsub(" ", "0", positions)
  paste0("Resample", zero_padded)
}
# - draw a random subsample of dat$index
#   dat: data frame with a column `index`
#   p:   share of rows to draw; the sample size is ceiling(nrow(dat) * p)
#   A single-row `dat` is returned as is, without sampling (presumably to
#   avoid sample() treating a length-one number x as the range 1:x).
subsample <- function(dat, p) {
  n_rows <- nrow(dat)
  if (n_rows == 1) {
    return(dat$index)
  }
  sample(dat$index, size = ceiling(n_rows * p))
}

# Function that generates a custom version of CreateDataPartition for this special use case
# - scope conditions
#   panel data for N=189 localities, each for the same number of weeks
# - assumption
#   1. Ensure that classes are balanced based on time-aggregate characteristics
#     specifically, separate localities by quantiles of aggregate incidence
#     ensure that each quantile equally represented in test and train splits
#   2. Keep locality level data strictly intact
#     All observations from one locality are either in the train or the test set, never in both
#     This assumption ensures that the temporal structure is basically taken out; what remains is a comparison of (repeated) observations between localities
library(plyr)

# Stratified train split that keeps all observations of a locality together.
#
# Arguments
# - y:        outcome, one value per observation; coerced with as.numeric()
#             and summed per locality
#             NOTE(review): for a factor this sums the integer level codes -
#             confirm that this is the intended measure of incidence
# - locality: locality id per observation (same length as y); may be numeric,
#             character or factor
# - times:    number of repeated draws
# - p:        share of localities drawn from each stratum
# - groups:   number of quantile break points used to build the strata
#
# Returns the row indices (positions in y) of all observations belonging to a
# sampled locality, in increasing order.
# NOTE(review): for times > 1 the draws are pooled, i.e. the result is the
# union over all repetitions (as in the previous implementation) - confirm
# that callers do not expect one index set per repetition.
CreateLocalityPartition <- function(y, locality, times = 1, p = 0.5, groups = min(5,length(y))){
  stopifnot(length(y) == length(locality))
  if (length(y) == 0) {
    return(integer(0))
  }

  # First separate localities by overall incidence
  # - aggregate the prediction dimension by locality
  pred.agg <- aggregate(as.numeric(y), by = list(locality = locality), FUN = sum)
  incidence <- pred.agg[, 2]
  # - keep the ids next to the aggregates: sampling below works on row
  #   positions of pred.agg, which must be mapped back to locality ids
  locality.ids <- pred.agg$locality

  # - generate quantile strata from the aggregated observations
  breaks <- unique(quantile(incidence, probs = seq(0, 1, length.out = groups)))
  if (length(breaks) < 2) {
    # all break points coincide (e.g. identical incidence everywhere); cut()
    # would read a single number as an interval count, so use one stratum
    strata <- factor(rep("all", length(incidence)))
  } else {
    strata <- cut(incidence, breaks, include.lowest = TRUE)
  }
  members <- split(seq_along(strata), strata, drop = TRUE)

  # - sample separately within each stratum, once per repetition
  sampled <- integer(0)
  for (j in seq_len(times)) {
    draw <- lapply(members, function(idx) {
      # a lone locality is always kept; this also avoids sample() treating a
      # length-one number x as the range 1:x
      if (length(idx) == 1) {
        idx
      } else {
        sample(idx, size = ceiling(length(idx) * p))
      }
    })
    sampled <- union(sampled, unlist(draw, use.names = FALSE))
  }

  # - pull out the index of all observations of the sampled localities
  which(locality %in% locality.ids[sampled])
}